home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Skunkware 5
/
Skunkware 5.iso
/
src
/
Tools
/
glimpsehttp
/
cgi-bin
/
aglimpse
next >
Wrap
Text File
|
1995-05-16
|
8KB
|
309 lines
#!/usr/local/bin/perl
#
# Acknowledgements
#
# Thanks to Guy Brooker (guy@jw.estec.esa.nl) for his AA interface,
# which was the starting point for this program.
#
# Paul Clark
# paul@cs.arizona.edu
#
# Modifications
#
# 2/22/94 Version 1.0, shell script version Paul Clark
# 4/21/94 Version 1.1, multiple archives support Paul Clark
# 4/22/94 Version 1.2, perl script Paul Clark
# 8/05/94 Version 1.3, verbosity&security Paul Clark
#10/05/94 Version 1.4, more security, improved
# output Paul Clark
# **** **** **** ****    CONFIGURABLE VARIABLES    **** **** **** ****

# Root of the HTTP server tree; the archive table is read from
# $HTTPD_HOME/wwwlib/amgr.cfg.
$HTTPD_HOME     = '/usr1/paul/httpd';
$HTTPD_NEWSHOME = '/usr1/paul/news';

# Full path of the glimpse executable.
$GLIMPSE_LOC    = '/usr/paul/bin/glimpse';

$CONVERT        = $HTTPD_HOME . '/wwwlib/cvtwww';

# URL prefix of the script that the result links point to.
$FSSERV         = '/cgi-bin/mfs';

# Regular expression selecting the file names whose matched lines get
# their HTML tags removed before display.  Comment the assignment out
# to disable tag removal.  Current value: names ending in ".html".
$SUPPRESS_HTML_TAGS = '\.html$';

# **** **** **** **** NO CONFIGURATION NEEDED BELOW **** **** **** ****
# PATH_INFO must look like "/<digits><rest>": the digits select the
# archive, the rest limits the search to part of it.  Anything else is
# answered with the "archive not found" page.
$_ = $ENV{'PATH_INFO'};
&err_noscript unless m|^/([0-9][0-9]*)(.*)$|;
($script, $path) = ($1, $2);

# Double quotes are not allowed in the path part.
$path =~ s|"||g;
# Look up the archive selected by $script in the archive table.
# Each line of amgr.cfg is a tab-separated record: field 0 is the index
# directory, field 1 the URL path printed in the result links and
# field 3 the archive id that is compared with $script.
open(AMGRCONF,"$HTTPD_HOME/wwwlib/amgr.cfg") || &err_conf;
undef $indexdir;
line: while (<AMGRCONF>) {
    # Remove the line terminator first; otherwise an id stored in the
    # last column keeps its newline and never equals $script.
    s/[\r\n]+$//;
    @fields = split(/\t/);
    # Short or empty lines have no id field and are skipped.
    next line unless defined($fields[3]);
    if ( $fields[3] eq $script ) {
        $indexdir = $fields[0];
        $urlpath  = $fields[1];
        last line;
    }
}
close(AMGRCONF);
&err_noscript unless $indexdir;
($ENV{'HOME'} = $indexdir) || &err_noscript; # some versions of Glimpse need it
# The glimpse binary has to exist and be executable on this machine.
&err_noglimpse unless -x $GLIMPSE_LOC;

# The archive must contain a readable glimpse index.
&err_noindex($indexdir) unless -r "$indexdir/.glimpse_index";

# ISINDEX support: when the script received command-line arguments the
# query string is the bare pattern, so default form fields are put in
# front of it.
if (@ARGV) {
    $prefix = "whole=on&case=on&query=";
}

# Without a query there is nothing to search for; show the ISINDEX page.
$query = $ENV{'QUERY_STRING'};
&err_noquery unless $query;
# Strip the variables out from the query string,
# and assign them into variables, prefixed by 'QS_'
&set_qs_variables( $prefix . $query );

# set_qs_variables(QUERY_STRING)
#
# Splits QUERY_STRING at '&' into name=value pairs and stores each value
# in the global scalar $QS_<name> (for example $QS_query, $QS_case).
#
#   - Only names that are plain identifiers (a letter or underscore
#     followed by word characters) are accepted; other pairs are ignored.
#   - A pair without '=' yields an empty value.
#   - Single quotes are removed from the value, as before.
#   - The value is everything after the FIRST '=', so a value that itself
#     contains '=' is kept whole.
#
# The pair is split into named variables explicitly: newer Perl releases
# no longer fill @_ from a split whose result is discarded, which left
# every $QS_ variable empty.  Values are assigned through a symbolic
# reference rather than by evaluating generated Perl source, so request
# data is never compiled as code.
sub set_qs_variables {
    my ($qstring) = @_;
    no strict 'refs';           # $QS_<name> is addressed by its name
    return unless defined $qstring;
    foreach my $pair ( split( /\&/, $qstring ) ) {
        my ($fname, $fvalue) = split( /=/, $pair, 2 );
        next unless defined($fname) && $fname =~ /^[a-z_A-Z]\w*\z/;
        $fvalue = '' unless defined $fvalue;
        $fvalue =~ s/\'//g;
        ${"QS_$fname"} = $fvalue;
    }
}
# Decode the URL-encoded search pattern: '+' stands for a blank and %XX
# for the character with hexadecimal code XX.  Only genuine pairs of hex
# digits are decoded; any other text after '%' is left as it is.
$QS_query =~ s|\+| |g;
$QS_query =~ s|%([0-9a-fA-F][0-9a-fA-F])|sprintf("%c", hex($1))|ge;

# The decoded pattern, kept for display in the generated pages.
$pquery = $QS_query;

# The pattern is later placed between single quotes on a shell command
# line, so each embedded single quote is rewritten as '"'"'.
$QS_query =~ s|\'|\'\"\'\"\'|g;

# Translate the form fields into glimpse command-line switches.
$OPT_errors="-$QS_errors" if $QS_errors =~ /^[0-8]$/;
$OPT_errors="-B" if $QS_errors =~ /^Best\+match$/;
$OPT_case="-i" if $QS_case =~ /^on$/;
$OPT_whole="-w" unless $QS_whole =~ /^on$/;

# The path that followed the archive id becomes the -F file filter.
# EVERY dot is escaped so that it only matches a literal dot, and single
# quotes are dropped because the filter is single-quoted on the command
# line.
$path =~ s/\./\\./g;
$path =~ s/\'//g;
$OPT_filter="-F '$path'" if $path;

# Output limits: the first run of digits in the form field, or a default.
if ($QS_maxlines =~ /\d+/) {
    $maxlines = $&;
} else {
    $maxlines = 20;
}
if ($QS_maxfiles =~ /\d+/) {
    $maxfiles = $&;
} else {
    $maxfiles = 100;
}

# Regular expression used to emphasise the matches in the result list:
# the words of the query joined as alternatives.
$highlight = $QS_query;
$highlight =~ s/^\W+//;
$highlight = join("|",split(/\W+/,$highlight));

# check if the query contains any words
# (compared with the empty string so that the word "0" is accepted)
&err_badquery if $highlight eq "";

# With -w in effect only whole words are emphasised.
$highlight = '\b('.$highlight.')\b' if $OPT_whole;
# html_escape(TEXT)
# Returns TEXT with the characters &, < and > replaced by their HTML
# entities so that it can be printed as page text.  An undefined
# argument yields the empty string.
sub html_escape {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

# Run glimpse on the selected archive and print the matches as an HTML
# page: one heading per file followed by a list of matched lines, each
# linked to $FSSERV, then a summary.

# The index is addressed relative to the current directory (-H .), so
# nothing can work unless we can enter the index directory.  This is
# checked before any output so the error page still gets a clean header.
chdir($indexdir) || &err_noindex($indexdir);

# The query comes from the request; escape it before echoing it.
$pquery_html = &html_escape($pquery);

print "Content-type: text/html\n\n" ;
print "<HEAD><TITLE>Result for query \"$pquery_html\"\n";
print "</TITLE></HEAD><BODY>\n";
print "<H1>Result for query \"$pquery_html\"</H1><HR>\n";

$cmd = "exec $GLIMPSE_LOC -y -n $OPT_case $OPT_whole $OPT_errors -H . " .
       "$OPT_filter '$QS_query' 2>&1 |";
$gpid = open(GOUT, $cmd );
if (!$gpid) {
    # Do not report "0 matches" when the search never ran.
    print "<H3>Unable to start glimpse</H3>\n</BODY>\n";
    &diag_exit;
}

$prevfile = "";
$lcount = 0;        # matched lines seen, including those not displayed
$fcount = 0;        # files displayed
$truncated = 0;     # set when the file limit stopped the listing
line: while (<GOUT>) {
    # Expected line format:  <file>: <line number>:<text>
    ( /^([^:]*):\s*(\d+):(.*)/ ) || next;
    $file = $1;
    $line = $2;
    $string = $3;
    # Only files below the index directory are reported; the directory
    # prefix is removed.  \Q..\E makes the directory name match literally
    # even if it contains regular-expression metacharacters.
    next unless $file =~ s|^\Q$indexdir\E||o;
    if ($file ne $prevfile) {
        $linecount = 0;
        # Stop once $maxfiles files have been shown (not one later).
        if ($fcount >= $maxfiles) {
            $truncated = 1;
            $fcount = "at least $fcount";
            $lcount = "at least $lcount";
            last line;
        }
        print "</UL>" if ( $prevfile ne "" );
        $prevfile = $file ;
        print "<H3>File <A HREF=\"",$FSSERV,"/",$script,$file,
              "\">/",$urlpath,$file,"</A></H3><UL>\n" ;
        $fcount++ ;
    }
    $lcount++ ;
    $linecount++;
    # Show up to $maxlines lines per file, then a single notice.
    if ($linecount > $maxlines) {
        print "<LI>Limit of $maxlines matched " .
              "lines per file exceeded...\n" if
            $linecount == $maxlines + 1;
        next line;
    }
    if ($SUPPRESS_HTML_TAGS && $file =~ /$SUPPRESS_HTML_TAGS/o) {
        $string =~ s#\</?[a-zA-Z][^>\n]*\>?##g;
    }
    # Mark the matches with control characters first, escape the text,
    # then turn the marks into <B>..</B>.  Emphasising after escaping
    # would also hit entity names (a query for "lt" inside "&lt;").
    $string =~ tr/\001\002//d;
    if ($OPT_case) {
        $string =~ s#$highlight#\001$&\002#gio;
    } else {
        $string =~ s#$highlight#\001$&\002#go;
    }
    $string = &html_escape($string);
    $string =~ s|\001|<B>|g;
    $string =~ s|\002|</B>|g;
    print "<LI><A HREF=\"",$FSSERV,"/",$script,$file,"?",$line,"#mfs\">\n" ;
    print "line ",$line,":",$string,"</A>\n" ;
}
# Close the list of the last file shown, also when the file limit ended
# the loop, and only then report the limit.
print "</UL>\n" if $prevfile ne "";
print "<H3>Limit of $maxfiles files exceeded...</H3>\n" if $truncated;
print "<HR>" ;
# The summary shows the query as typed, not its shell-quoted form.
print "<H2>Summary for query <code>\"",$pquery_html,"\":</code></H2>\n" ;
print "found ",$lcount," matches in ",$fcount," files\n" ;
print "</BODY>\n" ;
close(GOUT);
# NOTE(review): presumably a scratch file glimpse creates under its own
# process id -- confirm against the installed glimpse version.
unlink "/tmp/.glimpse_tmp.$gpid";
# Common exit point of the error pages: ends the script with exit
# status 1.
sub diag_exit {
    exit(1);
}
# err_noquery
# The script was called without a query.  Prints an ISINDEX page (for
# browsers without form support) together with a short description of
# Glimpse, then exits through diag_exit.
sub err_noquery {
    # The empty line after the Content-type header is required: it
    # separates the CGI header from the document body.
    print <<'EOM' ;
Content-type: text/html

<HEAD><TITLE>Glimpse Gateway</TITLE></HEAD>
<BODY><H1>Glimpse Gateway</H1>
This is a gateway to Glimpse.
Type a pattern to search in your browser's search dialog.<P>
<ISINDEX>
<H2>What is Glimpse ?</H2>
<QUOTE>
<P>
Glimpse (which stands for GLobal IMPicit SEarch) is an
indexing and query system that allows you to search through
all your files very quickly. For example, a search for
Schwarzkopf allowing two misspelling errors in 5600 files
occupying 77MB took 7 seconds on a SUN IPC. Glimpse supports
most of agrep's options (agrep is our powerful version
of grep) including approximate matching (e.g., finding
misspelled words), Boolean queries, and even some limited
forms of regular expressions.<BR>
Glimpse's running time is typically slower than systems
using inverted indexes, but its index is an order of
magnitude smaller (typically 2-5% of the size of the files).
<H2>Authors of Glimpse</H2>
Udi Manber, Sun Wu, and Burra Gopal<BR>
<ADDRESS>
Department of Computer
Science, University of Arizona, Tucson, AZ 85721.<BR>
glimpse@cs.arizona.edu
</ADDRESS>
</QUOTE>
<HR>
<ADDRESS>
Paul Clark<BR>
paul@cs.arizona.edu<BR>
</ADDRESS>
</BODY>
EOM
    &diag_exit;
}
# err_noglimpse
# The glimpse executable was not found.  Prints a page that tells the
# administrator how to fix the installation, then exits through
# diag_exit.
sub err_noglimpse {
    # The empty line after the Content-type header is required: it
    # separates the CGI header from the document body.
    print <<'EOM' ;
Content-type: text/html

<HEAD>
<TITLE>Glimpse not found</TITLE>
</HEAD>
<BODY>
<H1>Glimpse not found</H1>
This gateway relies on <CODE>Glimpse</CODE> search tool.
If it is installed, please set the correct path in the script file.
Otherwise obtain the latest version from
<A HREF="file://ftp.cs.arizona.edu/glimpse">ftp.cs.arizona.edu</A>
</BODY>
EOM
    &diag_exit;
}
# err_noindex(DIRECTORY)
# No glimpse index was found in DIRECTORY.  Prints a page naming the
# directory and recommending to index it, then exits through diag_exit.
sub err_noindex {
    my ($dir) = @_;
    print <<"EOM";
Content-type: text/html

<HEAD>
<TITLE>Glimpse Index not found</TITLE>
</HEAD>
<BODY>
<H1>Glimpse Index in directory '$dir' not found</H1>
Glimpse cannot proceed without index.
Please check if the directory being searched is indexed
by <code>glimpseindex</code>.
</BODY>
EOM
    &diag_exit;
}
# err_noscript
# The requested archive is unknown.  Prints a page naming the archive id
# ($script) and the configuration file that was consulted, then exits
# through diag_exit.
sub err_noscript {
    print <<"EOM";
Content-type: text/html

<HEAD>
<TITLE>Glimpse Archive not found</TITLE>
</HEAD>
<BODY>
<H1>Glimpse Archive not found</H1>
Cannot find script "$script" in config file $HTTPD_HOME/wwwlib/amgr.cfg
</BODY>
EOM
    &diag_exit;
}
# err_conf
# The archive configuration file could not be opened.  Prints a page
# naming the file, then exits through diag_exit.
sub err_conf {
    print <<"EOM";
Content-type: text/html

<HEAD>
<TITLE>Glimpse Archive Configuration File not found</TITLE>
</HEAD>
<BODY>
<H1>Glimpse Archive Configuration File not found</H1>
Cannot open configuration file $HTTPD_HOME/wwwlib/amgr.cfg
</BODY>
EOM
    &diag_exit;
}
# err_badquery
# The query contains no words.  Prints a page asking the user to refine
# the query, then exits through diag_exit.
sub err_badquery {
    # $pquery is the decoded text from the request; &, < and > are
    # replaced by entities so it cannot inject markup into the page.
    my $shown = defined($pquery) ? $pquery : '';
    $shown =~ s/&/&amp;/g;
    $shown =~ s/</&lt;/g;
    $shown =~ s/>/&gt;/g;
    print "Content-type: text/html\n\n";
    print "<HEAD>\n";
    print "<TITLE>Query is too broad</TITLE>\n";
    print "</HEAD>\n";
    print "<BODY>\n";
    print "<H1>Query is too broad</H1>\n";
    print "The query \"$shown\" doesn't contain any words and ".
          "thus will take too much time. Please refine your query.\n";
    print "</BODY>\n";
    &diag_exit;
}